#coding=utf-8
import re

import mysql.connector
from bs4 import BeautifulSoup
from pip._vendor import requests

# Raw cookie string copied out of the browser: pairs are ';'-separated,
# each pair is "key:value" (values may themselves contain ':').
raw_cookies='''bid:y7wIRhBJQOQ;_pk_id.100001.4cf6:bb7a7bb337e78f58.1510647529.2.1510711576.1510647613.;__utma:223695111.493235935.1510647531.1510647531.1510711574.2;__utmz:223695111.1510647531.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none);as:"https://movie.douban.com/subject/6982558/comments?status=P";ps:y;dbcl2:"156074167:1uTNltookr8";push_noty_num:0;push_doumail_num:0;ck:-A5T;_pk_ses.100001.4cf6:*;ap:1;__utmb:223695111.0.10.1510711574;__utmc:223695111'''

# Parse the raw string into a dict; split each pair on the FIRST ':'
# only, so colons inside the value (e.g. the dbcl2 token) survive.
cookie = {
    key: value
    for key, value in (pair.split(':', 1) for pair in raw_cookies.split(';'))
}

# Send a browser-like request and scrape the comment data we need.
def getInfo(url):
    """Download one Douban comment page and parse its comment entries.

    Args:
        url: Absolute URL of a Douban movie short-comment page.

    Returns:
        tuple: ``(Infolist, links)`` where ``Infolist`` is a list of
        ``[user_name, comment_text, rating, vote_count, comment_time]``
        rows for every comment on the page, and ``links`` contains the
        href of the "next page" anchor (empty list on the last page).
    """
    Infolist = []
    headers = dict()
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    headers['Accept-Encoding'] = 'gzip, deflate, br'
    headers['Accept-Language'] = 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    headers['Connection'] = 'keep-alive'
    headers['Host'] = 'movie.douban.com'
    headers['Cookie'] = 'dbcl2="156074167:1uTNltookr8"; push_noty_num=0; push_doumail_num=0; ck=-A5T; ap=1; __utmc=30149280; __utmc=223695111'
    headers['Referer'] = 'https://www.douban.com/accounts/login?source=movie'
    headers['User-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Cache-Control'] = 'max-age=0'

    # BUG FIX: the original call passed ``headers=dict()`` — an empty
    # dict — so the headers above (including the login cookie) were
    # never actually sent with the request.
    data = requests.get(url, timeout=20, headers=headers).text

    soup = BeautifulSoup(data, 'lxml')

    # Each comment sits in <div class="comment-item"> containing a
    # <span class="comment-info"> (user link, optional
    # <span class="... rating" title="...">, <span class="comment-time">),
    # a <p> with the comment body, and a <span class="votes"> counter.
    comments = soup.find_all("div", class_='comment-item')
    for item in comments:
        com = item.find('span', class_='comment-info')
        rating = item.find('span', class_='rating')  # absent when the user gave no stars
        if rating is not None:
            rating = rating['title']
        else:
            rating = '无评分'
        listinfo = [
            com.contents[1].text.strip(),                       # Douban user name
            item.find('p').text,                                # comment body
            rating,                                             # star-rating title or "no rating"
            item.find('span', class_='votes').text,             # upvote count
            item.find('span', class_='comment-time')['title'],  # full timestamp
        ]
        Infolist.append(listinfo)
        # BUG FIX: the original had a ``break`` here, so only the FIRST
        # comment of every page was ever collected.

    # Pull the "next page" href out of the raw HTML.
    pattern = re.compile(r'a href="(.*?)" data-page="" class="next"')
    links = re.findall(pattern, data)
    return Infolist, links
# Connect to the database.
def connDB():
    """Open a MySQL connection to the local ``test`` schema.

    Returns:
        tuple: ``(connection, cursor)`` ready for use.
    """
    connection = mysql.connector.connect(
        user='ssm',
        password='ssm',
        host='localhost',
        database='test',
        charset='utf8mb4',
    )
    return connection, connection.cursor()
# Disconnect from the database.
def exitCoon(conn, cursor):
    """Release database resources: cursor first, then the connection."""
    for resource in (cursor, conn):
        resource.close()
# Save scraped rows into the database, creating the table on first run.
def saveToMysql(datalist):
    """Insert comment rows into ``douban_movie_changcheng_comment``.

    Args:
        datalist: list of rows, each
            ``[user_name, comment_content, rating, comment_vote, comment_time]``.
    """
    conn, cursor = connDB()

    # BUG FIX: filter on table_schema too, otherwise a same-named table
    # in ANY database makes this existence check a false positive.
    cursor.execute(
        "select table_name from information_schema.`TABLES` "
        "WHERE table_name='douban_movie_changcheng_comment' "
        "AND table_schema=DATABASE()")
    # BUG FIX: MySQL Connector/Python cursors have no ``next()`` method
    # (and the original also consumed the row twice); fetchone() returns
    # None when the table does not exist yet.
    if cursor.fetchone() is None:
        # BUG FIX: ``bitint`` -> ``bigint`` and ``primary_key`` ->
        # ``PRIMARY KEY`` — the original DDL was invalid SQL.
        cursor.execute('create table douban_movie_changcheng_comment '
                       '(id bigint(20) NOT NULL AUTO_INCREMENT,user_name varchar(50),comment_content varchar(3000),rating varchar(20),comment_vote bigint,comment_time varchar(20)'
                       ',PRIMARY KEY (id))')

    sql = ('insert into douban_movie_changcheng_comment '
           '(user_name,comment_content,rating,comment_vote,comment_time) '
           'values (%s,%s,%s,%s,%s)')
    for data in datalist:
        cursor.execute(sql, [data[0], data[1], data[2], data[3], data[4]])
    # One commit for the whole batch instead of one per row.
    conn.commit()
    exitCoon(conn, cursor)

# Entry point: walk every comment page, then persist all rows at once.
if __name__ == '__main__':
    allcomments = []
    url = 'https://movie.douban.com/subject/6982558/comments?status=P'
    page = 0
    # The original used a flag variable ``b`` that was never set to
    # False; ``while True`` + ``break`` expresses the intent directly.
    while True:
        page += 1
        print(page)
        Infolist, links = getInfo(url)
        allcomments.extend(Infolist)  # append this page's rows to the total
        if not links:
            break  # no "next" anchor found -> last page reached
        # The scraped href is HTML-escaped ("&amp;"); strip the "amp;"
        # leftovers — a plain replace, no regex needed.
        url = 'https://movie.douban.com/subject/6982558/comments' + links[0].replace('amp;', '')

    saveToMysql(allcomments)




